library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rtweet)
## 
## Attaching package: 'rtweet'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library("readxl")
library(ggplot2)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(penalized)
## Welcome to penalized. For extended examples, see vignette("penalized").
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(corrplot)
## corrplot 0.92 loaded
library(reticulate)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggmap':
## 
##     wind
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library('DT')
library(future)
## 
## Attaching package: 'future'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
# Pull the `test` object across from the Python session exposed as `py`.
df <- py$test

# Show every column name of the imported table.
colnames(df)
##   [1] "id"                                           
##   [2] "mlsListingId"                                 
##   [3] "mlsOrgId"                                     
##   [4] "agentId"                                      
##   [5] "agentName"                                    
##   [6] "agentOrganizationName"                        
##   [7] "agentBrePrefix"                               
##   [8] "openHouseSchedule"                            
##   [9] "propertyType"                                 
##  [10] "picturesNum"                                  
##  [11] "tour3D"                                       
##  [12] "tour3Ds"                                      
##  [13] "openHouseFlag"                                
##  [14] "flag"                                         
##  [15] "primaryType"                                  
##  [16] "secondaryType"                                
##  [17] "hoaFee"                                       
##  [18] "listingSource"                                
##  [19] "lotSize"                                      
##  [20] "chimeFirstInserted"                           
##  [21] "hasPriceChanged"                              
##  [22] "builtYear"                                    
##  [23] "oldPrice"                                     
##  [24] "mlsUpdateL"                                   
##  [25] "newListingFlag"                               
##  [26] "backOnMarketFlag"                             
##  [27] "daysOnList"                                   
##  [28] "priceChange"                                  
##  [29] "hasPriceChangedV1"                            
##  [30] "hasStatusChange"                              
##  [31] "mlsListDateL"                                 
##  [32] "mlsListDateLSort"                             
##  [33] "newUpdateTime"                                
##  [34] "openHouseSchedules"                           
##  [35] "openHouseDesc"                                
##  [36] "mlsUpdateTimeL"                               
##  [37] "updateTimeV2"                                 
##  [38] "price"                                        
##  [39] "ceilingPrice"                                 
##  [40] "bedrooms"                                     
##  [41] "bathrooms"                                    
##  [42] "fullBaths"                                    
##  [43] "halfBaths"                                    
##  [44] "quarterBaths"                                 
##  [45] "threeQuarterBaths"                            
##  [46] "sqft"                                         
##  [47] "detailsDescribe"                              
##  [48] "spaceRent"                                    
##  [49] "locationId"                                   
##  [50] "city"                                         
##  [51] "state"                                        
##  [52] "zipCode"                                      
##  [53] "previewPicture"                               
##  [54] "streetAddress"                                
##  [55] "longitude"                                    
##  [56] "latitude"                                     
##  [57] "waterfrontFlag"                               
##  [58] "specialListingCondition"                      
##  [59] "detailUrl"                                    
##  [60] "updateTimeS"                                  
##  [61] "lastPrimaryChangeTime"                        
##  [62] "collectStatus"                                
##  [63] "generateTime"                                 
##  [64] "addressPartialFlag"                           
##  [65] "mlsOrg"                                       
##  [66] "bidLevel"                                     
##  [67] "createDate"                                   
##  [68] "updateDate"                                   
##  [69] "coAgentName"                                  
##  [70] "coAgentOrgName"                               
##  [71] "elementarySchool"                             
##  [72] "middleSchool"                                 
##  [73] "highSchool"                                   
##  [74] "schoolDistrict"                               
##  [75] "oldValue"                                     
##  [76] "monthlyTotalFees"                             
##  [77] "coSellingAgentStatus"                         
##  [78] "coAgentId"                                    
##  [79] "commercialRealEstate"                         
##  [80] "listingPictures"                              
##  [81] "resourceKey"                                  
##  [82] "leadCount"                                    
##  [83] "statusChangeOrSoldDate"                       
##  [84] "mlsListingDate"                               
##  [85] "totalAvailableAcres"                          
##  [86] "location"                                     
##  [87] "taxAmount"                                    
##  [88] "showPreviousNext"                             
##  [89] "previousNextSource"                           
##  [90] "detailLink"                                   
##  [91] "listingStatus"                                
##  [92] "coAgent"                                      
##  [93] "appDetailLink"                                
##  [94] "listingProvided"                              
##  [95] "address"                                      
##  [96] "flagText"                                     
##  [97] "listingStatusText"                            
##  [98] "propertyTypeText"                             
##  [99] "link"                                         
## [100] "communityFeatures"                            
## [101] "petsDescription"                              
## [102] "stories"                                      
## [103] "chimeVideoLink"                               
## [104] "statusUpdateTime"                             
## [105] "priceUpdateTime"                              
## [106] "elementarySchoolV1"                           
## [107] "middleSchoolV1"                               
## [108] "highSchoolV1"                                 
## [109] "schoolDistrictV1"                             
## [110] "Prop_Merge_Address"                           
## [111] "pin"                                          
## [112] "pin10"                                        
## [113] "year"                                         
## [114] "class_x"                                      
## [115] "triad_name"                                   
## [116] "triad_code"                                   
## [117] "township_name"                                
## [118] "township_code"                                
## [119] "nbhd_code"                                    
## [120] "tax_code"                                     
## [121] "tieback_proration_rate"                       
## [122] "lon"                                          
## [123] "lat"                                          
## [124] "x_3435"                                       
## [125] "y_3435"                                       
## [126] "prop_address_full"                            
## [127] "prop_address_city_name"                       
## [128] "prop_address_state"                           
## [129] "prop_address_zipcode_1"                       
## [130] "mail_address_name"                            
## [131] "mail_address_full"                            
## [132] "mail_address_city_name"                       
## [133] "mail_address_state"                           
## [134] "mail_address_zipcode_1"                       
## [135] "census_block_group_geoid"                     
## [136] "census_block_geoid"                           
## [137] "census_congressional_district_geoid"          
## [138] "census_county_subdivision_geoid"              
## [139] "census_place_geoid"                           
## [140] "census_puma_geoid"                            
## [141] "census_school_district_unified_geoid"         
## [142] "census_state_representative_geoid"            
## [143] "census_state_senate_geoid"                    
## [144] "census_tract_geoid"                           
## [145] "census_zcta_geoid"                            
## [146] "census_data_year"                             
## [147] "census_acs5_congressional_district_geoid"     
## [148] "census_acs5_county_subdivision_geoid"         
## [149] "census_acs5_place_geoid"                      
## [150] "census_acs5_puma_geoid"                       
## [151] "census_acs5_school_district_unified_geoid"    
## [152] "census_acs5_state_representative_geoid"       
## [153] "census_acs5_state_senate_geoid"               
## [154] "census_acs5_tract_geoid"                      
## [155] "census_acs5_data_year"                        
## [156] "cook_board_of_review_district_num"            
## [157] "cook_board_of_review_district_data_year"      
## [158] "cook_commissioner_district_num"               
## [159] "cook_commissioner_district_data_year"         
## [160] "cook_judicial_district_num"                   
## [161] "cook_judicial_district_data_year"             
## [162] "cook_municipality_num"                        
## [163] "cook_municipality_name"                       
## [164] "cook_municipality_data_year"                  
## [165] "ward_num"                                     
## [166] "ward_data_year"                               
## [167] "chicago_community_area_num"                   
## [168] "chicago_community_area_name"                  
## [169] "chicago_community_area_data_year"             
## [170] "chicago_police_district_num"                  
## [171] "chicago_police_district_data_year"            
## [172] "env_flood_fema_sfha"                          
## [173] "env_flood_fema_data_year"                     
## [174] "env_flood_fs_factor"                          
## [175] "env_flood_fs_risk_direction"                  
## [176] "env_flood_fs_data_year"                       
## [177] "env_ohare_noise_contour_no_buffer_bool"       
## [178] "env_ohare_noise_contour_half_mile_buffer_bool"
## [179] "env_ohare_noise_contour_data_year"            
## [180] "env_airport_noise_dnl"                        
## [181] "env_airport_noise_data_year"                  
## [182] "school_elementary_district_geoid"             
## [183] "school_elementary_district_name"              
## [184] "school_secondary_district_geoid"              
## [185] "school_secondary_district_name"               
## [186] "school_school_year"                           
## [187] "school_data_year"                             
## [188] "tax_community_college_district"               
## [189] "tax_community_college_district_name"          
## [190] "tax_community_college_district_data_year"     
## [191] "tax_park_district_num"                        
## [192] "tax_park_district_name"                       
## [193] "tax_park_district_data_year"                  
## [194] "access_cmap_walk_id"                          
## [195] "access_cmap_walk_nta_score"                   
## [196] "access_cmap_walk_total_score"                 
## [197] "access_cmap_walk_data_year"                   
## [198] "misc_subdivision_id"                          
## [199] "misc_subdivision_data_year"                   
## [200] "num_pin_in_half_mile"                         
## [201] "num_bus_stop_in_half_mile"                    
## [202] "num_bus_stop_data_year"                       
## [203] "num_foreclosure_in_half_mile_past_5_years"    
## [204] "num_foreclosure_per_1000_pin_past_5_years"    
## [205] "num_foreclosure_data_year"                    
## [206] "num_school_in_half_mile"                      
## [207] "num_school_data_year"                         
## [208] "nearest_bike_trail_dist_ft"                   
## [209] "nearest_bike_trail_data_year"                 
## [210] "nearest_cemetery_gnis_code"                   
## [211] "nearest_cemetery_name"                        
## [212] "nearest_cemetery_dist_ft"                     
## [213] "nearest_cemetery_data_year"                   
## [214] "nearest_cta_route_id"                         
## [215] "nearest_cta_route_name"                       
## [216] "nearest_cta_route_dist_ft"                    
## [217] "nearest_cta_route_data_year"                  
## [218] "nearest_cta_stop_id"                          
## [219] "nearest_cta_stop_name"                        
## [220] "nearest_cta_stop_dist_ft"                     
## [221] "nearest_cta_stop_data_year"                   
## [222] "nearest_golf_course_id"                       
## [223] "nearest_golf_course_dist_ft"                  
## [224] "nearest_golf_course_data_year"                
## [225] "nearest_hospital_gnis_code"                   
## [226] "nearest_hospital_name"                        
## [227] "nearest_hospital_dist_ft"                     
## [228] "nearest_hospital_data_year"                   
## [229] "lake_michigan_dist_ft"                        
## [230] "lake_michigan_data_year"                      
## [231] "nearest_major_road_osm_id"                    
## [232] "nearest_major_road_name"                      
## [233] "nearest_major_road_dist_ft"                   
## [234] "nearest_major_road_data_year"                 
## [235] "nearest_metra_route_id"                       
## [236] "nearest_metra_route_name"                     
## [237] "nearest_metra_route_dist_ft"                  
## [238] "nearest_metra_route_data"                     
## [239] "nearest_metra_stop_id"                        
## [240] "nearest_metra_stop_name"                      
## [241] "nearest_metra_stop_dist_ft"                   
## [242] "nearest_metra_stop_data_year"                 
## [243] "nearest_park_osm_id"                          
## [244] "nearest_park_name"                            
## [245] "nearest_park_dist_ft"                         
## [246] "nearest_park_data_year"                       
## [247] "nearest_railroad_id"                          
## [248] "nearest_railroad_name"                        
## [249] "nearest_railroad_dist_ft"                     
## [250] "nearest_railroad_data_year"                   
## [251] "nearest_water_id"                             
## [252] "nearest_water_name"                           
## [253] "nearest_water_dist_ft"                        
## [254] "nearest_water_data_year"                      
## [255] "nearest_neighbor_1_pin10"                     
## [256] "nearest_neighbor_1_dist_ft"                   
## [257] "nearest_neighbor_2_pin10"                     
## [258] "nearest_neighbor_2_dist_ft"                   
## [259] "nearest_neighbor_3_pin10"                     
## [260] "nearest_neighbor_3_dist_ft"                   
## [261] "nearest_bike_trail_id"                        
## [262] "econ_qualified_opportunity_zone_num"          
## [263] "econ_qualified_opportunity_zone_data_year"    
## [264] "tax_tif_district_num"                         
## [265] "tax_tif_district_name"                        
## [266] "tax_tif_district_data_year"                   
## [267] "tieback_key_pin"                              
## [268] "tax_special_service_area_num"                 
## [269] "tax_special_service_area_name"                
## [270] "tax_special_service_area_data_year"           
## [271] "nearest_bike_trail_name"                      
## [272] "chicago_industrial_corridor_num"              
## [273] "chicago_industrial_corridor_name"             
## [274] "chicago_industrial_corridor_data_year"        
## [275] "census_school_district_elementary_geoid"      
## [276] "census_school_district_secondary_geoid"       
## [277] "census_acs5_school_district_elementary_geoid" 
## [278] "census_acs5_school_district_secondary_geoid"  
## [279] "econ_coordinated_care_area_num"               
## [280] "econ_coordinated_care_area_data_year"         
## [281] "tax_sanitation_district_num"                  
## [282] "tax_sanitation_district_name"                 
## [283] "tax_sanitation_district_data_year"            
## [284] "econ_enterprise_zone_num"                     
## [285] "econ_enterprise_zone_data_year"               
## [286] "econ_industrial_growth_zone_num"              
## [287] "econ_industrial_growth_zone_data_year"        
## [288] "tax_library_district_num"                     
## [289] "tax_library_district_name"                    
## [290] "tax_library_district_data_year"               
## [291] "tax_fire_protection_district_num"             
## [292] "tax_fire_protection_district_name"            
## [293] "tax_fire_protection_district_data_year"       
## [294] "pin2"                                         
## [295] "certified"                                    
## [296] "first_pass"                                   
## [297] "class_y"                                      
## [298] "tax_year"                                     
## [299] "nbhd"                                         
## [300] "hd_sf"                                        
## [301] "town_code"                                    
## [302] "type_resd"                                    
## [303] "apts"                                         
## [304] "ext_wall"                                     
## [305] "roof_cnst"                                    
## [306] "rooms"                                        
## [307] "beds"                                         
## [308] "bsmt"                                         
## [309] "bsmt_fin"                                     
## [310] "heat"                                         
## [311] "oheat"                                        
## [312] "air"                                          
## [313] "frpl"                                         
## [314] "attic_type"                                   
## [315] "attic_fnsh"                                   
## [316] "hbath"                                        
## [317] "tp_plan"                                      
## [318] "tp_dsgn"                                      
## [319] "cnst_qlty"                                    
## [320] "site"                                         
## [321] "gar1_size"                                    
## [322] "gar1_cnst"                                    
## [323] "gar1_att"                                     
## [324] "gar1_area"                                    
## [325] "gar2_size"                                    
## [326] "gar2_cnst"                                    
## [327] "gar2_att"                                     
## [328] "gar2_area"                                    
## [329] "porch"                                        
## [330] "ot_impr"                                      
## [331] "bldg_sf"                                      
## [332] "repair_cnd"                                   
## [333] "multi_code"                                   
## [334] "ncu"                                          
## [335] "pri_est_land"                                 
## [336] "pri_est_bldg"                                 
## [337] "centroid_x"                                   
## [338] "centroid_y"                                   
## [339] "tractce"                                      
## [340] "multi_ind"                                    
## [341] "addr"                                         
## [342] "modeling_group"                               
## [343] "fbath"                                        
## [344] "age"                                          
## [345] "use_1"                                        
## [346] "o_hare_noise"                                 
## [347] "floodplain"                                   
## [348] "near_major_road"                              
## [349] "total_units"                                  
## [350] "age_squared"                                  
## [351] "age_decade"                                   
## [352] "age_decade_squared"                           
## [353] "lot_size_squared"                             
## [354] "improvement_size_squared"                     
## [355] "location_factor"                              
## [356] "garage_indicator"                             
## [357] "pure_market_sale"                             
## [358] "pure_market_filter"                           
## [359] "neigborhood_code_mapping_"                    
## [360] "square_root_of_lot_size"                      
## [361] "square_root_of_age"                           
## [362] "square_root_of_improvement_size"              
## [363] "town_and_neighborhood"                        
## [364] "most_recent_sale_date"                        
## [365] "doc_no"                                       
## [366] "most_recent_sale_price"                       
## [367] "deed_type"                                    
## [368] "n_units"                                      
## [369] "per_ass"                                      
## [370] "condo_class_factor"                           
## [371] "residential_share_of_building"                
## [372] "condition_desirability_and_utility"           
## [373] "condo_strata"                                 
## [374] "multi_family_ind"                             
## [375] "renovation"                                   
## [376] "total_bldg_sf"
# Coerce the coordinate columns to numeric and scale them by 1e5.
df$longitude <- as.numeric(df$longitude) * 100000
df$latitude <- as.numeric(df$latitude) * 100000

# Store the id as a factor.
df$id <- as.factor(df$id)

# Keep residential rows with a non-zero `sqft` and a positive `builtYear`
# (dplyr::filter also drops rows where any condition evaluates to NA), add the
# squared gap between 2023 and the build year, and keep the first row per id.
# `filter` is namespace-qualified because stats and plotly export one too.
df <- df %>%
  dplyr::filter(
    primaryType == "Residential",
    sqft != 0,
    builtYear > 0
  ) %>%
  mutate(byexp = (2023 - builtYear)^2) %>%
  distinct(id, .keep_all = TRUE)


# Histogram of the `price` column, using ggplot2's default binning.
df %>%
  ggplot(aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Columns whose pairwise correlations are plotted below.
df_cols <- dplyr::select(
  df,
  price, builtYear, bedrooms, bathrooms, sqft, longitude, latitude
)

# Each correlation uses every row where both columns are present; the result
# is drawn as a mixed correlation plot with "AOE" variable ordering.
df_cols %>%
  cor(use = "pairwise.complete.obs") %>%
  corrplot.mixed(order = "AOE")

# Ordinary least-squares fit of price on the listed covariates.
lin_mod <- lm(
  price ~ 1 + builtYear + bedrooms + certified + bathrooms + sqft +
    longitude + latitude,
  data = df
)

# Robust counterpart using the bisquare psi function. rlm() stops after 20
# IWLS iterations by default, which was not enough for this fit to converge,
# so the iteration cap is raised.
rlin_mod <- rlm(
  price ~ 1 + builtYear + bedrooms + certified + bathrooms + sqft +
    longitude + latitude,
  data = df,
  psi = psi.bisquare,
  maxit = 100
)
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
# Coefficient table and fit statistics for the least-squares model.
lin_mod %>% summary()
## 
## Call:
## lm(formula = price ~ 1 + builtYear + bedrooms + certified + bathrooms + 
##     sqft + longitude + latitude, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1969132   -79346     5100    71928  8812620 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.693e+06  6.754e+06  -0.251 0.802049    
## builtYear    4.511e+02  1.267e+02   3.559 0.000376 ***
## bedrooms    -3.068e+04  5.777e+03  -5.310 1.15e-07 ***
## certified    8.704e-01  1.150e-02  75.669  < 2e-16 ***
## bathrooms    8.415e+04  8.099e+03  10.390  < 2e-16 ***
## sqft         1.626e+02  7.201e+00  22.582  < 2e-16 ***
## longitude    5.872e-01  8.825e-01   0.665 0.505839    
## latitude     1.376e+00  6.079e-01   2.264 0.023635 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274500 on 4088 degrees of freedom
##   (409 observations deleted due to missingness)
## Multiple R-squared:  0.8362, Adjusted R-squared:  0.8359 
## F-statistic:  2981 on 7 and 4088 DF,  p-value: < 2.2e-16
# Coefficient table and residual scale for the robust (rlm) fit.
rlin_mod %>% summary()
## 
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified + 
##     bathrooms + sqft + longitude + latitude, data = df, psi = psi.bisquare)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1362534   -41926     3727    51310 10115656 
## 
## Coefficients:
##             Value         Std. Error    t value      
## (Intercept) -1.529534e+07  1.835088e+06 -8.334900e+00
## builtYear    2.540033e+02  3.443450e+01  7.376400e+00
## bedrooms    -2.739048e+03  1.569707e+03 -1.744900e+00
## certified    8.680000e-01  3.100000e-03  2.777364e+02
## bathrooms    4.577282e+04  2.200640e+03  2.079980e+01
## sqft         3.648750e+01  1.956500e+00  1.864930e+01
## longitude   -1.296100e+00  2.398000e-01 -5.405700e+00
## latitude     8.159000e-01  1.652000e-01  4.940200e+00
## 
## Residual standard error: 68630 on 4088 degrees of freedom
##   (409 observations deleted due to missingness)
# Diagnostic plots for the fitted model.
plot(lin_mod)

# Attach the model prediction and a copy of the observed price to each row.
df[["pred"]] <- predict(lin_mod, newdata = df)
df[["pp"]] <- df[["price"]]

# Print the estimated coefficients.
coef(lin_mod)
##   (Intercept)     builtYear      bedrooms     certified     bathrooms 
## -1.693325e+06  4.510923e+02 -3.067961e+04  8.703784e-01  8.415230e+04 
##          sqft     longitude      latitude 
##  1.626081e+02  5.871929e-01  1.376165e+00
# Start again from the Python-side table so columns added to `df` earlier in
# the script (`pred`, `pp`) do not carry over.
df <- py$test
df$id <- as.factor(df$id)

# Coerce the coordinate columns to numeric and scale them by 1e5.
df$longitude <- as.numeric(df$longitude) * 100000
df$latitude <- as.numeric(df$latitude) * 100000

# Same row filter as before: residential, non-zero sqft, positive build year,
# one row per id. `filter` is qualified because stats and plotly export one too.
df <- df %>%
  dplyr::filter(
    primaryType == "Residential",
    sqft != 0,
    builtYear > 0
  ) %>%
  mutate(byexp = (2023 - builtYear)^2) %>%
  distinct(id, .keep_all = TRUE)

# Modelling columns; the three extra predictors are converted to double in a
# single step instead of three copy-pasted assignments.
df_cols <- df %>%
  dplyr::select(
    price, builtYear, bedrooms, bathrooms, sqft, longitude, latitude,
    certified, o_hare_noise, pri_est_land, pri_est_bldg
  ) %>%
  mutate(across(c(o_hare_noise, pri_est_land, pri_est_bldg), as.double))

# Robust regression with rlm()'s default psi. The object keeps the name
# `lin_mod` because the statements that follow refer to it by that name.
lin_mod <- rlm(
  price ~ 1 + builtYear + bedrooms + certified + bathrooms + sqft +
    longitude + latitude + o_hare_noise + pri_est_land + pri_est_bldg,
  data = df_cols
)

# Coefficient table for the model currently stored in `lin_mod`.
lin_mod %>% summary()
## 
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified + 
##     bathrooms + sqft + longitude + latitude + o_hare_noise + 
##     pri_est_land + pri_est_bldg, data = df_cols)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1596299   -48350     1430    46690  9716951 
## 
## Coefficients:
##              Value         Std. Error    t value      
## (Intercept)  -1.099033e+07  2.139145e+06 -5.137700e+00
## builtYear     2.405557e+02  3.893600e+01  6.178200e+00
## bedrooms     -1.098814e+04  1.739843e+03 -6.315600e+00
## certified     1.018800e+00  6.600000e-03  1.553726e+02
## bathrooms     4.667789e+04  2.458262e+03  1.898820e+01
## sqft          5.957970e+01  2.261000e+00  2.635160e+01
## longitude    -8.474000e-01  2.764000e-01 -3.065900e+00
## latitude      7.254000e-01  1.834000e-01  3.955600e+00
## o_hare_noise -2.637882e+04  1.256847e+04 -2.098800e+00
## pri_est_land  3.894000e-01  3.080000e-02  1.263440e+01
## pri_est_bldg -1.213000e-01  7.500000e-03 -1.609760e+01
## 
## Residual standard error: 70320 on 4052 degrees of freedom
##   (442 observations deleted due to missingness)
# Model predictions for every row of the modelling frame.
df_cols$pred <- predict(lin_mod, newdata = df_cols)

# Copy of the observed price. It is read from `df_cols` itself rather than
# from `df`, so the assignment does not depend on the two frames having
# identical row order and length.
df_cols$pp <- df_cols$price

# Observed price minus predicted price.
df_cols$PriceOverMarket <- df_cols$price - df_cols$pred

# Convert the same three predictors to double in `df` as well, since `df` is
# what gets passed to the bootstrap refits.
df <- df %>%
  mutate(across(c(o_hare_noise, pri_est_bldg, pri_est_land), as.double))

#' Refit the robust price model on one bootstrap resample
#'
#' Draws `nrow(df)` rows from `df` with replacement, fits the `rlm` price
#' model on that resample and returns its tidied coefficient table.
#'
#' @param df Data frame containing `price` and the predictors named in the
#'   model formula below.
#' @param maxit Maximum number of IWLS iterations handed to `MASS::rlm()`.
#'   `rlm()`'s own default of 20 was repeatedly exhausted on resamples
#'   ("'rlm' failed to converge in 20 steps"), so a larger default is used.
#' @return The output of `broom::tidy()` for the fitted model.
bootstrapping <- function(df, maxit = 100) {
  # sample.int() draws the same indices as sample(1:n, n, replace = TRUE) but
  # does not misbehave when n is 0 (where 1:n would be c(1, 0)).
  sampled_rows <- sample.int(nrow(df), replace = TRUE)
  resample <- df[sampled_rows, ]

  bs_mod <- MASS::rlm(
    price ~ 1 + builtYear + bedrooms + certified + bathrooms + sqft +
      longitude + latitude + o_hare_noise + pri_est_land + pri_est_bldg,
    data = resample,
    maxit = maxit
  )

  broom::tidy(bs_mod)
}

# Run 1000 bootstrap refits inside a future. The expression draws random
# numbers (bootstrapping() resamples rows), so `seed = TRUE` is set to give
# the future a statistically sound RNG stream.
# NOTE(review): no plan() call is visible in this script, so the future is
# presumably evaluated sequentially unless a plan is configured elsewhere.
bs_test <- future(
  replicate(1000, bootstrapping(df), simplify = FALSE),
  seed = TRUE
)

# Wait for the future to resolve, then stack the per-replicate coefficient
# tables into a single data frame.
bsCombined <- dplyr::bind_rows(value(bs_test))
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps

## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning: UNRELIABLE VALUE: Future ('<none>') unexpectedly generated random
## numbers without specifying argument 'seed'. There is a risk that those random
## numbers are not statistically sound and the overall results might be invalid.
## To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random
## numbers are produced via the L'Ecuyer-CMRG method. To disable this check, use
## 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore".
# Histogram of the `statistic` values in bsCombined for the rows whose term
# is "bathrooms" (presumably the resampled test statistics -- confirm against
# the code that builds bsCombined). The data expression is kept inline because
# hist() builds its default title and x-axis label from the argument text.
hist(
  bsCombined$statistic[bsCombined$term == "bathrooms"],
  col = "black"
)

# Pull the `test` object from the Python session into R.
df <- py$test

# Make id categorical and rescale both coordinates by a factor of 1e5
# (they are coerced to numeric first).
df[["id"]] <- as.factor(df[["id"]])
df[["longitude"]] <- 1e5 * as.numeric(df[["longitude"]])
df[["latitude"]] <- 1e5 * as.numeric(df[["latitude"]])

# Keep residential rows with a non-zero sqft and a positive builtYear, add
# the squared difference between 2023 and builtYear, keep the first row per
# id, and finally coerce o_hare_noise, pri_est_bldg and pri_est_land to
# double so they can be used as numeric predictors.
df <- df %>%
  filter(
    primaryType == "Residential",
    sqft != 0,
    builtYear > 0
  ) %>%
  mutate(byexp = (2023 - builtYear)^2) %>%
  distinct(id, .keep_all = TRUE) %>%
  mutate(
    o_hare_noise = as.double(o_hare_noise),
    pri_est_bldg = as.double(pri_est_bldg),
    pri_est_land = as.double(pri_est_land)
  )



# Robust linear model (rlm, default settings) of price on the structural,
# location and price-estimate columns of df. The formula is written inline,
# term for term, so the `Call:` echoed by summary() is unaffected.
lin_mod <- rlm(
  price ~ 1 + builtYear + bedrooms + certified +
    bathrooms + sqft + longitude +
    latitude + o_hare_noise + pri_est_land + pri_est_bldg,
  data = df
)

# Print residual quantiles, the coefficient table and the residual scale.
summary(lin_mod)
## 
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified + 
##     bathrooms + sqft + longitude + latitude + o_hare_noise + 
##     pri_est_land + pri_est_bldg, data = df)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1596299   -48350     1430    46690  9716951 
## 
## Coefficients:
##              Value         Std. Error    t value      
## (Intercept)  -1.099033e+07  2.139145e+06 -5.137700e+00
## builtYear     2.405557e+02  3.893600e+01  6.178200e+00
## bedrooms     -1.098814e+04  1.739843e+03 -6.315600e+00
## certified     1.018800e+00  6.600000e-03  1.553726e+02
## bathrooms     4.667789e+04  2.458262e+03  1.898820e+01
## sqft          5.957970e+01  2.261000e+00  2.635160e+01
## longitude    -8.474000e-01  2.764000e-01 -3.065900e+00
## latitude      7.254000e-01  1.834000e-01  3.955600e+00
## o_hare_noise -2.637882e+04  1.256847e+04 -2.098800e+00
## pri_est_land  3.894000e-01  3.080000e-02  1.263440e+01
## pri_est_bldg -1.213000e-01  7.500000e-03 -1.609760e+01
## 
## Residual standard error: 70320 on 4052 degrees of freedom
##   (442 observations deleted due to missingness)
# Model-implied price for every row of df; rows with missing predictors
# get NA rather than being dropped, so the result lines up with df.
df[["pred"]] <- predict(lin_mod, newdata = df)

# Copy of the observed price, and its gap to the fitted value
# (positive = observed price above the model's prediction).
df[["pp"]] <- df[["price"]]
df[["PriceOverMarket"]] <- df[["pp"]] - df[["pred"]]

# Standard diagnostic plots for the fitted model.
plot(lin_mod)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced